### -------------------------------------------------- #
### ---- General Notes ---- 
### -------------------------------------------------- #


#' SUMMARY OF DEDUPLICATION VERSIONS:


#' 6) VERSION 6: Survey only
#' Use only zip code from survey as state source

#' 10) VERSION 10: Hybrid of application data
#' - Prioritize their "current address" (from the application) then their "university state" then their "cell phone".


### -------------------------------------------------- #
### ---- ASSIGN KEY VARS & SET UP ---- 
### -------------------------------------------------- #

### Clear global environment
# Removes every object that ls() reports in the environment this script is run in,
# so sourcing this file discards any existing workspace objects.
rm(list=ls()) 

### Libraries:
# NOTE(review): p_unload(all) presumably detaches previously attached add-on
# packages before the explicit list below is loaded -- confirm against pacman docs.
pacman::p_unload(all)
# Packages used below: tidyverse (dplyr/purrr verbs), magrittr (%<>%),
# lubridate (ymd), haven (write_dta). The remaining packages are loaded but not
# referenced by name in this file.
pacman::p_load(pacman, tidyverse, data.table, lazyeval, magrittr, lubridate, 
               foreign, haven, readstata13 #for exporting to stata
               )

### -------------------------------------------------- #
### ---- FUN: Load & clean Data ---- 
### -------------------------------------------------- #

# Note: source of election dates is wikipedia 
# https://en.wikipedia.org/wiki/Election_Day_(United_States)

#' Load the combined match dataset and add graduation-date variables
#'
#' Reads `<file_path>Temp_Data/comb_matches_wDups_m<attempt_no>_v2.RData`, takes
#' the `comb_tfa_matches` object stored in it, and adds:
#'   * `RegisteredParty` coerced to character,
#'   * `new_degreedate`, `DegYear`, `DegMonth`, `DegDay` parsed from `degreedate`,
#'   * one `post_grad_YY` column per election day from 2002 to 2016
#'     (1 when election day is strictly after the degree date, 0 otherwise,
#'     NA when the degree date is missing).
#'
#' @param file_path Path prefix placed in front of "Temp_Data/".
#' @param attempt_no Match attempt number used in the file name.
#' @return The augmented `comb_tfa_matches` data frame.
load_data <- function(file_path, attempt_no)
{
  # Load combined dataset into a scratch environment so that a file without the
  # expected object fails loudly instead of silently picking up a global of the
  # same name.
    rdata_file <- paste0(file_path,"Temp_Data","/comb_matches_wDups_m",attempt_no,"_v2.RData")
    loaded <- new.env()
    load(file = rdata_file, envir = loaded)
    if (!exists("comb_tfa_matches", envir = loaded, inherits = FALSE)) {
      stop("Object 'comb_tfa_matches' not found in ", rdata_file, call. = FALSE)
    }
    comb_tfa_matches <- get("comb_tfa_matches", envir = loaded, inherits = FALSE)

  # Save RegisteredParty as character (labels lost in merge)
    comb_tfa_matches %<>% mutate(RegisteredParty = as.character(RegisteredParty))

  # Reformat graduation dates
  # NOTE(review): `%y` on input maps 00-68 to 20xx, while DegYear below treats only
  # 00-19 as 20xx. For two-digit years 20-68 DegYear (19xx) and new_degreedate (20xx)
  # therefore disagree, and the post_grad_* flags follow new_degreedate. Confirm
  # which century is intended before relying on those rows.
    comb_tfa_matches %<>%
      mutate(new_degreedate = as.Date(degreedate, "%d-%b-%y")) %>%
    # Degree year: assume 1900s, then switch to 2000s if that gives a year < 1920
      mutate(DegYear = strftime(new_degreedate, "19%y")) %>%
      mutate(DegYear = ifelse(as.numeric(DegYear) < 1920,
                              strftime(new_degreedate, "20%y"),
                              DegYear)) %>%
      mutate(DegMonth = strftime(new_degreedate, "%m")) %>%
      mutate(DegDay = strftime(new_degreedate, "%d"))
  # Note: Only three people missing degree year (IDs: 1342618, 2007387, 2131130)

  # Add pre-post graduation indicators for each election
    # Election days (from wikipedia), keyed by two-digit year
      elec_dates <-  list("16" = ymd("2016-11-08"), "15" = ymd("2015-11-03"),
                          "14" = ymd("2014-11-04"), "13" = ymd("2013-11-05"),
                          "12" = ymd("2012-11-06"), "11" = ymd("2011-11-08"),
                          "10" = ymd("2010-11-02"), "09" = ymd("2009-11-03"),
                          "08" = ymd("2008-11-04"), "07" = ymd("2007-11-06"),
                          "06" = ymd("2006-11-07"), "05" = ymd("2005-11-08"),
                          "04" = ymd("2004-11-02"), "03" = ymd("2003-11-04"),
                          "02" = ymd("2002-11-05") )
    # One post_grad_YY column per election; the column name is built as a string
    # and injected with `!!name :=` instead of pasting and parsing code.
    # new_degreedate is already a Date, so it is subtracted directly.
    for (y in names(elec_dates)) {
      flag_name <- paste0("post_grad_", y)
      comb_tfa_matches %<>%
        mutate(!!flag_name := ifelse(elec_dates[[y]] - new_degreedate > 0, 1, 0))
    }

  return(comb_tfa_matches)
}

### -------------------------------------------------- #
### ---- FUN: Calc No. of Matches ----
### -------------------------------------------------- #

#' Count voter-file matches per applicant
#'
#' A row counts as a match when `matched_to_vf == 1` and the relevant source
#' tally is positive (`total_sources_app` for `no_of_matches`,
#' `total_sources_nomat` for `no_of_matches_nomat`).
#'
#' @param comb_tfa_matches Combined dataset, one row per applicant-match pair.
#' @param attempt_no Not used here; kept so the signature matches its callers.
#' @return One row per `personid` with `no_of_matches` and `no_of_matches_nomat`.
id_no_matches_app <- function(comb_tfa_matches, attempt_no)
{
  # Group first, then build each 0/1 indicator directly inside the sum
  per_person <- group_by(comb_tfa_matches, personid)
  summarise(
    per_person,
    no_of_matches = sum(
      ifelse(matched_to_vf==1 & total_sources_app > 0, 1, 0),
      na.rm=TRUE
    ),
    no_of_matches_nomat = sum(
      ifelse(matched_to_vf==1 & total_sources_nomat > 0, 1, 0),
      na.rm=TRUE
    )
  )
}

#' Print the distribution of match counts across applicants
#'
#' Prints two tables: applicants by `no_of_matches`, then applicants by
#' `no_of_matches_nomat`, both taken from `id_no_matches_app()`.
#'
#' @param comb_tfa_matches Combined dataset passed on to `id_no_matches_app()`.
#' @param attempt_no Passed on to `id_no_matches_app()`.
sum_no_matches <- function(comb_tfa_matches, attempt_no)
{
  # Per-applicant match counts
  per_person <- id_no_matches_app(comb_tfa_matches, attempt_no)

  # Applicants by number of application-source matches
  by_app <- summarize(group_by(per_person, no_of_matches), applicants = n())
  print(by_app)

  # Applicants by number of "nomat" matches
  by_nomat <- summarize(group_by(per_person, no_of_matches_nomat), applicants = n())
  print(by_nomat)
}

#' Attach per-applicant match counts to the combined dataset
#'
#' @param comb_tfa_matches Combined dataset containing `personid`.
#' @param attempt_no Passed on to `id_no_matches_app()`.
#' @return `comb_tfa_matches` with `no_of_matches` and `no_of_matches_nomat`
#'   joined on by `personid`.
calc_no_matches <- function(comb_tfa_matches, attempt_no)
{
  per_person_counts <- id_no_matches_app(comb_tfa_matches, attempt_no)
  comb_tfa_matches %>%
    left_join(per_person_counts, by = "personid")
}


### -------------------------------------------------- #
### ---- FUN: De-Duplicate Combined Datasets ----
### -------------------------------------------------- #

### -------------------------------------------------- #
### ** Save Dedups ----
### -------------------------------------------------- #

#' Save a finished de-duplicated dataset
#'
#' Writes `dat` to `<file_path>Temp_Data/comb_matches_deDuped_m<attempt_no>_v<dedup_no>.RData`
#' under the object name `comb_tfa_matches_dedup<dedup_no>`, and optionally to
#' `<file_path>Temp_Data/<file_date>_comb_tfa_matches_m<attempt_no>_dedup<dedup_no>.dta`.
#'
#' @param dat Data frame to save.
#' @param attempt_no Match attempt number (used in file names).
#' @param dedup_no De-duplication version number (used in file and object names).
#' @param file_path Path prefix placed in front of "Temp_Data/".
#' @param file_date Date tag used in the Stata file name.
#' @param convert_stata If TRUE, also write the Stata file.
save_dedup <- function(dat, attempt_no, dedup_no, file_path, file_date, convert_stata=TRUE)
{
  # Set file paths and deduped df name
  fp_r <- paste0(file_path,"Temp_Data","/comb_matches_deDuped_m",attempt_no,"_v",dedup_no,".RData")
  fp_s <- paste0(file_path,"Temp_Data/",file_date,"_comb_tfa_matches_m",attempt_no,"_dedup",dedup_no,".dta")
  df_name <- paste0("comb_tfa_matches_dedup",dedup_no)

  # Bind the data under its versioned name in this frame and save it by name,
  # so the .RData file restores an object called comb_tfa_matches_dedup<dedup_no>
  assign(df_name, dat)
  save(list = df_name, file = fp_r)

  # (Option) Save stata dataset
  if (isTRUE(convert_stata)) {
    # Dots in column names are replaced with underscores before export
    names(dat) <- gsub("\\.", "_", names(dat))
    # Write to the Stata path built above (fp_s)
    haven::write_dta(dat, fp_s)
  }
}

#' Re-open a saved de-duplicated dataset and export it to Stata
#'
#' Loads `<fp>Temp_Data/comb_matches_deDuped_m<attempt_no>_v<dedup_no>.RData`,
#' takes the object `comb_tfa_matches_dedup<dedup_no>` from it, replaces dots in
#' column names with underscores, and writes
#' `<fp>Temp_Data/<file_date>_comb_tfa_matches_m<attempt_no>_dedup<dedup_no>.dta`.
#'
#' @param attempt_no Match attempt number (used in file names).
#' @param dedup_no De-duplication version number (used in file and object names).
#' @param file_date Date tag used in the Stata file name.
#' @param fp Path prefix placed in front of "Temp_Data/".
open_r_convert_stata <- function(attempt_no, dedup_no, file_date, fp="")
{
  # Set file names
  fn_r <- paste0(fp,"Temp_Data","/comb_matches_deDuped_m",attempt_no,"_v",dedup_no,".RData")
  fp_stata_p1 <- paste0(fp,"Temp_Data/",file_date,"_comb_tfa_matches_m")
  fp_stata <- paste0(fp_stata_p1,attempt_no,"_dedup",dedup_no,".dta")

  # Load into a scratch environment and fetch the dataset by name; a file that
  # lacks the expected object stops here instead of falling through to a global.
  dn <- paste0('comb_tfa_matches_dedup', dedup_no)
  loaded <- new.env()
  load(fn_r, envir = loaded)
  if (!exists(dn, envir = loaded, inherits = FALSE)) {
    stop("Object '", dn, "' not found in ", fn_r, call. = FALSE)
  }
  dat <- get(dn, envir = loaded, inherits = FALSE)

  # Save as stata (dots in column names replaced with underscores first)
  names(dat) <- gsub("\\.", "_", names(dat))
  haven::write_dta(dat, fp_stata)
}



### -------------------------------------------------- #
### ** Dedup 6: Single State Criteria ----
### -------------------------------------------------- #

#' De-duplicate by keeping only matches from a single state source (dedup 6)
#'
#' Rows whose `vf_match_state_<state_criteria>_flag` equals 1 are kept as they
#' are. Every applicant with no such row is kept once, with all voter-file and
#' match variables set to NA.
#'
#' @param dat Combined dataset containing `personid` and the flag column.
#' @param state_criteria A single source code (e.g. "st"); it selects the column
#'   `vf_match_state_<state_criteria>_flag`. The default vector only lists
#'   examples and must be overridden with one value.
#' @param vf_vars Character vector of voter-file columns to blank.
#' @param other_match_vars Character vector of match columns to blank.
#' @return The unmatched applicants (one blanked row each) followed by the
#'   flagged rows.
select_one_flag <- function(dat, state_criteria=c("uni","pa","ca","st","ac1","cell","home"), vf_vars, other_match_vars)
{
  # A vector here would build several flag names; require exactly one so the
  # filter below is unambiguous.
    if (length(state_criteria) != 1) {
      stop("`state_criteria` must be a single value, e.g. \"st\".", call. = FALSE)
    }

  # Part 1: Keep rows matched through the chosen source
    flag_var <- paste0("vf_match_state_",state_criteria,"_flag")
    dedup_sub1 <- dat %>% filter(.data[[flag_var]] == 1)

  # Part 2: Isolate, code as missing, and de-duplicate everyone else
    dedup_sub0 <- dat %>%
    # Keep only applicants with no row matched through the chosen source
      filter(!personid %in% dedup_sub1$personid) %>%
    # Change vf & match variables to missing
      mutate_at(c(vf_vars,other_match_vars), function(x) NA) %>%
    # Only keep one of the duplicates
      distinct(personid, .keep_all=TRUE)

  # Combine
    rbind(dedup_sub0, dedup_sub1)
}


#' Dedup 6: keep only matches made through the survey state ("st")
#'
#' Runs `select_one_flag()` with `state_criteria = "st"`, saves the result via
#' `save_dedup()` as dedup version 6, and optionally returns it.
#'
#' @param comb_tfa_matches Combined dataset with duplicates.
#' @param attempt_no Match attempt number (used in file names).
#' @param vf_vars,other_match_vars Columns blanked for applicants without an "st" match.
#' @param file_path,file_date Passed on to `save_dedup()`.
#' @param convert_stata Passed on to `save_dedup()`.
#' @param return_dat If TRUE, return the de-duplicated data.
dedup_v6 <- function(comb_tfa_matches, attempt_no, vf_vars, other_match_vars, file_path, file_date, 
                     convert_stata=TRUE, return_dat=FALSE)
{
  # De-duplicate on the survey-state flag
    dat <- select_one_flag(comb_tfa_matches,
                           state_criteria = c("st"),
                           vf_vars = vf_vars,
                           other_match_vars = other_match_vars)
  # Write the result to disk
    save_dedup(dat,
               attempt_no = attempt_no,
               dedup_no = 6,
               file_path = file_path,
               file_date = file_date,
               convert_stata = convert_stata)
  # (Optional) Return dataset
    if (return_dat == TRUE) dat
}


### -------------------------------------------------- #
### ** Dedup 10: Hybrid, no mat, dedup by elec ----
### -------------------------------------------------- #

#' Subset rows matched through one source that also show a vote in a given year
#'
#' Keeps rows where `vf_match_state_<state_criteria>_flag == 1` and the general
#' election vote-history column `VH<YY>G` holds a value from 1 to 10.
#'
#' @param dat Dataset containing the flag and vote-history columns.
#' @param state_criteria A single source code (e.g. "ca", "uni", "cell"). The
#'   default vector only lists examples and must be overridden with one value.
#' @param YY Two-digit election year as a string, e.g. "16".
#' @return The filtered rows of `dat`.
gen_subset_flag_plusVH <- function(dat, state_criteria=c("uni","pa","ca","zc","ac1","cell","home"), YY)
{
  # Exactly one source and one year are needed to name the two columns
  if (length(state_criteria) != 1 || length(YY) != 1) {
    stop("`state_criteria` and `YY` must each be a single value.", call. = FALSE)
  }
  VH_range <- c(1:10)
  flag_var <- paste0("vf_match_state_",state_criteria,"_flag")
  vh_var <- paste0("VH",YY,"G")
  # Column names are held as strings, so they are looked up through `.data`
  dat %>% filter( (.data[[flag_var]] == 1) & (.data[[vh_var]] %in% VH_range) )
}

## Create subsets for election taking place pre- and post-graduation from college (duplicate matches only)
#' Copy pre-graduation vote history from a matched subset into the master data
#'
#' For election year `YY`, replaces `VH<YY>G` in `master` with the value from
#' `sub2_pre_YY` (first row with the same `personid`) for every master row where
#' `post_grad_<YY> == 0`, the `personid` appears in `sub2_pre_YY`, and
#' `no_of_matches > 1`. Rows where that condition is NA get NA; all other rows
#' keep their current value.
#'
#' @param master Data frame with `personid`, `no_of_matches`, `post_grad_<YY>`, `VH<YY>G`.
#' @param sub2_pre_YY Data frame with `personid` and `VH<YY>G`.
#' @param YY Two-digit election year as a string, e.g. "16".
#' @return `master` with `VH<YY>G` updated.
merge_w_master_pre <- function(master, sub2_pre_YY, YY)
{
  vh_col <- paste0("VH",YY,"G")
  post_col <- paste0("post_grad_",YY)

  # Fail early with a readable message if a needed column is absent
  missing_master <- setdiff(c("personid", "no_of_matches", post_col, vh_col), names(master))
  missing_sub <- setdiff(c("personid", vh_col), names(sub2_pre_YY))
  if (length(missing_master) > 0 || length(missing_sub) > 0) {
    stop("Missing column(s): ", paste(c(missing_master, missing_sub), collapse = ", "), call. = FALSE)
  }

  new_matched_ids <- sub2_pre_YY$personid

  # Rows to overwrite: election before graduation, id present in subset, 2+ matches
  replace_row <- (master[[post_col]] == 0) &
    (master$personid %in% new_matched_ids) &
    (master$no_of_matches > 1)

  # Value to copy in, aligned to master's rows. `[[` is used so the lookup is a
  # plain vector whether the subset is a data.frame or a tibble.
  replacement <- sub2_pre_YY[[vh_col]][match(master$personid, sub2_pre_YY$personid)]

  master[[vh_col]] <- ifelse(replace_row, replacement, master[[vh_col]])
  master
}

#' Copy post-graduation vote history from a matched subset into the master data
#'
#' For election year `YY`, replaces `VH<YY>G` in `master` with the value from
#' `sub2_post_YY` (first row with the same `personid`) for every master row where
#' `post_grad_<YY> == 1`, the `personid` appears in `sub2_post_YY`, and
#' `no_of_matches > 1`. Rows where that condition is NA get NA; all other rows
#' keep their current value.
#'
#' @param master Data frame with `personid`, `no_of_matches`, `post_grad_<YY>`, `VH<YY>G`.
#' @param sub2_post_YY Data frame with `personid` and `VH<YY>G`.
#' @param YY Two-digit election year as a string, e.g. "16".
#' @return `master` with `VH<YY>G` updated.
merge_w_master_post <- function(master, sub2_post_YY, YY)
{
  vh_col <- paste0("VH",YY,"G")
  post_col <- paste0("post_grad_",YY)

  # Fail early with a readable message if a needed column is absent
  missing_master <- setdiff(c("personid", "no_of_matches", post_col, vh_col), names(master))
  missing_sub <- setdiff(c("personid", vh_col), names(sub2_post_YY))
  if (length(missing_master) > 0 || length(missing_sub) > 0) {
    stop("Missing column(s): ", paste(c(missing_master, missing_sub), collapse = ", "), call. = FALSE)
  }

  new_matched_ids <- sub2_post_YY$personid

  # Rows to overwrite: election after graduation, id present in subset, 2+ matches
  replace_row <- (master[[post_col]] == 1) &
    (master$personid %in% new_matched_ids) &
    (master$no_of_matches > 1)

  # Value to copy in, aligned to master's rows. `[[` is used so the lookup is a
  # plain vector whether the subset is a data.frame or a tibble.
  replacement <- sub2_post_YY[[vh_col]][match(master$personid, sub2_post_YY$personid)]

  master[[vh_col]] <- ifelse(replace_row, replacement, master[[vh_col]])
  master
}



#' Dedup 10: hybrid of application sources, resolved election by election
#'
#' Applicants with exactly one application-source match keep that row.
#' Applicants with 2+ matches are collapsed to one row whose voter-file and
#' match variables are NA and whose `VH*` columns default to 0. Then, for each
#' election year, `VH<YY>G` is filled from the first source -- current address
#' ("ca"), then university state ("uni"), then cell phone ("cell") -- that has a
#' matched row with a recorded vote (see `gen_subset_flag_plusVH()`). The fill
#' runs once for pre-graduation elections and once for post-graduation ones.
#' The result is saved via `save_dedup()` as dedup version 10.
#'
#' @param comb_tfa_matches Combined dataset with duplicates; must already carry
#'   `no_of_matches` and the `post_grad_YY` columns.
#' @param attempt_no Match attempt number (used in file names).
#' @param vf_vars,other_match_vars Columns set to NA for applicants with 2+ matches.
#' @param file_path,file_date Passed on to `save_dedup()`.
#' @param convert_stata Passed on to `save_dedup()`.
#' @param return_dat If TRUE, return the de-duplicated data.
dedup_v10 <- function(comb_tfa_matches, attempt_no, vf_vars, other_match_vars, file_path, file_date, 
                      convert_stata=TRUE, return_dat=FALSE)
{
  # Create copy w/ numeric voting vars
  dat <- comb_tfa_matches %>%
    mutate_at(vars(starts_with("VH")), as.numeric)

  # Rows of applicants with 2+ matches: the source of the vote-history fill
  sub2 <- dat %>% filter(no_of_matches>1)

  # no_of_matches has to stay intact until the fill is finished: the zero
  # default below and merge_w_master_pre/post all test `no_of_matches > 1`, and a
  # blanked (NA) value would turn each of those tests into NA. It is therefore
  # held back here and blanked as the very last step.
  blank_vars <- c(vf_vars, other_match_vars)
  blank_first <- setdiff(blank_vars, "no_of_matches")

  # Create master dataset
  master <- dat %>%
    # Remove any non-app dups for personid w/ 1 app dup
    filter(no_of_matches!=1 | (no_of_matches==1 & matched_to_vf==1 & total_sources_app > 0)) %>%
    # Change vf & match variables to missing for applicants with 2+ matches
    mutate_at(blank_first, funs(ifelse(no_of_matches>1, NA, .))) %>%
    # Only keep one of the duplicates
    distinct(personid, .keep_all=TRUE) %>%
    # For applicants with 2+ matches, set default value to zero
    mutate_at(vars(starts_with("VH")), funs(ifelse(no_of_matches>1, 0, .)))

  # Election years and source priority (1st, 2nd, 3rd choice)
  YY_vec <- c("16","15","14","13","12","11","10","09","08","07","06","05","04","03","02")
  source_priority <- c("ca", "uni", "cell")

  # Fill VH<YY>G for one election year, walking the sources in priority order.
  # After each source, applicants it covered are dropped from the candidate rows
  # so a lower-priority source cannot overwrite them.
  fill_by_priority <- function(master, sub2, YY, merge_fun)
  {
    for (src in source_priority) {
      sub2_YY <- gen_subset_flag_plusVH(sub2, state_criteria=src, YY=YY)
      master <- merge_fun(master, sub2_YY, YY)
      sub2 <- sub2 %>% filter(!personid %in% sub2_YY$personid)
    }
    master
  }

  # Pre-graduation elections first, then post-graduation elections
  for (YY in YY_vec) master <- fill_by_priority(master, sub2, YY, merge_w_master_pre)
  for (YY in YY_vec) master <- fill_by_priority(master, sub2, YY, merge_w_master_post)

  # Now that nothing else reads it, blank no_of_matches for applicants with 2+
  # matches, as for the other match variables (only if the caller listed it)
  if ("no_of_matches" %in% blank_vars) {
    master %<>% mutate(no_of_matches = ifelse(no_of_matches>1, NA, no_of_matches))
  }

  # Save combined dataset
  save_dedup(master, attempt_no, dedup_no=10, file_path, file_date, convert_stata)
  # (Optional) Return dataset
  if (return_dat == TRUE) master
}


### -------------------------------------------------- #


### -------------------------------------------------- #
### ---- Function: Run Everything ---- 
### -------------------------------------------------- #

#' Load the combined data, report match counts, and run the chosen dedup versions
#'
#' @param attempt_no Match attempt number (used in input and output file names).
#' @param file_date Date tag used in Stata file names.
#' @param choose_dedup_nos Dedup version numbers to run; for each number `n` a
#'   function named `dedup_v<n>` must exist.
#' @param convert_stata If TRUE, each dedup function also writes a Stata file.
#' @return The list produced by `map()` over `choose_dedup_nos` (the completion
#'   message for each version).
run_everything <- function(attempt_no, file_date, choose_dedup_nos=c(6,10), convert_stata=TRUE)
{
  # File Locations
  file_path <- ""
  # Variables from Voter File & Match Vars
  vf_vars <- c("DT_ID",  "DT_RegID",  "StateVoterID",  "State",  "NamePrefix",
               "FirstName_vf",  "MiddleName",  "LastName_vf",  "NameSuffix",  "Sex_vf",
               "BirthYear_vf", 
               "ModeledEthnicity",  "Race",  "CountyFIPS",  "PrecinctNumber",
               "PrecinctName",  "RegistrationAddress1",  "RegistrationAddress2",
               "RegistrationAddressZip5_vf",  "RegistrationAddressLatitude",
               "RegistrationAddressLongitude",  "MailingAddress1",
               "MailingAddress2",  "MailingAddressZip5_vf",  "LandLine_AreaCode_vf",  
               "LandLine_Number_vf",  "CellPhone_AreaCode_vf",  "CellPhone_Number_vf",
               "CellPhone_SourceCode",  "CellPhone_MatchLevel",
               "CellPhone_ReliabilityCode",  "LastActiveDate",  "RegistrationDate",
               "VoterStatus",  "PermanentAbsenteeFlag",  "VH16G",  "VH16P",  "VH16PP",
               "VH15G",  "VH15P",  "VH14G",  "VH14P",  "VH13G",  "VH13P",  "VH12G",  "VH12P",
               "VH12PP",  "VH11G",  "VH11P",  "VH10G",  "VH10P",  "VH09G",  "VH09P",  "VH08G",
               "VH08P",  "VH08PP",  "VH07G",  "VH07P",  "VH06G",  "VH06P",  "VH05G",  "VH05P",
               "VH04G",  "VH04P",  "VH04PP",  "VH03G",  "VH03P",  "VH02G",  "VH02P")
  other_match_vars <- c("vf_match_state_ca_flag",
                        "vf_match_state_uni_flag",
                        "vf_match_state_st_flag","vf_match_state_mr_flag",
                        "vf_match_state_ca",
                        "vf_match_state_cell_flag", "vf_match_state_cell",
                        "vf_match_state_uni","vf_match_state_st","vf_match_state_mr",
                        "total_sources_all",
                        "total_sources_sub", "total_sources_nomat", "total_sources_app",
                        "state_vf_matched", "no_of_matches", "no_of_matches_nomat",
                        "matched_to_vf")
  # Load Data
    comb_tfa_matches <- load_data(file_path, attempt_no)
  
  # Calc No. of Matches (print the distribution, then attach the counts)
    sum_no_matches(comb_tfa_matches, attempt_no)
    comb_tfa_matches <- calc_no_matches(comb_tfa_matches, attempt_no)
  
  # Function for running one dedup version
    run_dedup <- function(dedup_no)
    {
      # Look the function up by name (dedup_v6, dedup_v10, ...) and call it
      # directly rather than pasting together and parsing a call string
        dedup_fun <- get(paste0("dedup_v", dedup_no), mode = "function")
        dedup_fun(comb_tfa_matches, attempt_no, vf_vars, other_match_vars, file_path, file_date,
                  convert_stata = isTRUE(convert_stata))
      # Print message when complete
        print(paste0("Completed dedup ",dedup_no," for match ",attempt_no,"."))
    }  
  
  # Loop over selected dedup numbers
    map(choose_dedup_nos, run_dedup)
}



### -------------------------------------------------- #
### ---- RUN ---- 
### -------------------------------------------------- #

# Date tag that goes into the Stata file names
set_file_date <- "4-13"

# Dedup versions to produce
d <- c(6, 10)


### -------------------------------- #
### *** Gen dedups ---- 
### -------------------------------- #

# Build the R datasets for match attempt 1; Stata export happens in the next step
run_everything(
  attempt_no = 1,
  file_date = set_file_date,
  choose_dedup_nos = d,
  convert_stata = FALSE
)

### -------------------------------- #
### *** Convert to Stata ---- 
### -------------------------------- #

# Re-open each saved R dataset and write it out as a Stata file
map(d, function(dedup_no) {
  open_r_convert_stata(attempt_no = 1, dedup_no = dedup_no, file_date = set_file_date)
  print(paste0("Finished saving dedup ", dedup_no, "."))
})
